Open In Colab

Validation¶

Validation of algorithms and transformation results.

Chap 7. Validação

  • Section 7.3 Resultados
    • Section 7.3.2 Semantic Annotation
      • Classificação de regras operativas, fatos, termos e nomes
    • Section 7.3.3 nlp2sbvr

Google colab¶

In [1]:
%load_ext autoreload
%autoreload 2

import sys

IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
  from google.colab import drive
  drive.mount('/content/drive')
  !rm -rf cfr2sbvr configuration checkpoint
  !git clone https://github.com/asantos2000/master-degree-santos-anderson.git cfr2sbvr
  %pip install -r cfr2sbvr/code/requirements.txt
  !cp -r cfr2sbvr/code/src/configuration .
  !cp -r cfr2sbvr/code/src/checkpoint .
  !cp -r cfr2sbvr/code/config.colab.yaml config.yaml
  DEFAULT_CONFIG_FILE="config.yaml"
else:
  DEFAULT_CONFIG_FILE="../config.yaml"

Imports¶

In [2]:
# Standard library imports
import json
import os
import time
from datetime import datetime
from typing import List

# Local application/library-specific imports
import logging_setup.main as logging_setup
import matplotlib.pyplot as plt
import missingno as mi
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import rules_taxonomy_provider.main as rules_taxonomy_provider

# Third-party imports
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from scipy.spatial.distance import cosine
from scipy.stats import kendalltau, spearmanr
from openai import OpenAI
from pydantic import BaseModel, Field

# Local modules
import configuration.main as configuration
import checkpoint.main as checkpoint
from checkpoint.main import (
  Document,
  get_all_checkpoints,
  get_elements_from_checkpoints,
  restore_checkpoint,
  save_checkpoint,
)
import llm_query.main as llm_query
from llm_query.main import query_instruct_llm
from rules_taxonomy_provider.main import RulesTemplateProvider

DEV_MODE = True

if DEV_MODE:
    # Development mode
    import importlib

    importlib.reload(configuration)
    importlib.reload(logging_setup)
    importlib.reload(checkpoint)
    importlib.reload(llm_query)
    importlib.reload(rules_taxonomy_provider)

# Ensure plots are displayed inline if using a Jupyter notebook
%matplotlib inline

from IPython.display import display

Settings¶

Configuration¶

In [3]:
# Load configuration
config = configuration.load_config(DEFAULT_CONFIG_FILE)

Logging¶

In [4]:
logger = logging_setup.setting_logging(config["DEFAULT_LOG_DIR"], config["LOG_LEVEL"])
2024-12-14 13:12:23 - INFO - Logging is set up with daily rotation.

Checkpoints¶

Restore the checkpoint¶

In [5]:
# Restore the checkpoint

# To run after extraction
last_checkpoint = configuration.get_last_filename(
    config["DEFAULT_CHECKPOINT_DIR"], "documents", "json"
)

logger.info(f"{last_checkpoint=}")

config["DEFAULT_CHECKPOINT_FILE"] = last_checkpoint

manager = restore_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"])
2024-12-14 13:12:23 - INFO - last_checkpoint='../data/checkpoints/documents-2024-12-08-10.json'
2024-12-14 13:12:23 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-10.json
2024-12-14 13:12:23 - INFO - Checkpoint restored from ../data/checkpoints/documents-2024-12-08-10.json.

General functions¶

In [6]:
def remove_section_symbol(input_string: str) -> str:
    """
    Removes the '§' symbol from the input string and trims whitespace.

    Args:
        input_string (str): The string from which to remove the '§' symbol.

    Returns:
        str: The cleaned string without the '§' symbol and leading/trailing whitespace.

    Raises:
        TypeError: If 'input_string' is not a string.
    """
    if not isinstance(input_string, str):
        raise TypeError("input_string must be a string")
    return input_string.replace("§", "").strip()
In [7]:
def prompt_analysis(raw_data, output_dir):
    # Create a DataFrame from the raw data
    data = pd.DataFrame(
        raw_data,
        columns=["filename", "doc_type", "elapsed_time", "usage", "created", "model"],
    )

    # Transform 'created' to a human-readable datetime format
    data["created"] = pd.to_datetime(data["created"], unit="s")

    # Extract relevant information from the 'usage' dictionary
    data["completion_tokens"] = data["usage"].apply(lambda x: x["completion_tokens"])
    data["prompt_tokens"] = data["usage"].apply(lambda x: x["prompt_tokens"])
    data["total_tokens"] = data["usage"].apply(lambda x: x["total_tokens"])

    # Define a function to get reference model context length
    def get_reference_model_context_length(model):
        return reference_models.get(
            model, 128_000
        )  # Default to 128,000 if model is unknown

    # Define a function to get the price per million tokens
    def get_price_per_million_tokens(model):
        return price_per_million_tokens.get(
            model, 2.50
        )  # Default to 2.50 if model is unknown

    # Add context length and price per million tokens columns
    data["reference_context_length"] = data["model"].apply(
        get_reference_model_context_length
    )
    data["price_per_million_tokens"] = data["model"].apply(get_price_per_million_tokens)

    # Overall Statistics
    total_tokens = data["total_tokens"].sum()
    num_samples = len(data)
    average_elapsed_time = data["elapsed_time"].mean()
    estimated_cost = (
        data["total_tokens"] / 1_000_000 * data["price_per_million_tokens"]
    ).sum()
    average_percentage_context_length = (
        data["total_tokens"] / data["reference_context_length"]
    ).mean() * 100
    min_created = data["created"].min().strftime("%Y-%m-%d %H:%M:%S")
    max_created = data["created"].max().strftime("%Y-%m-%d %H:%M:%S")

    # Add filename column to each statistic for origin tracking
    filename = file_info["filename"]
    # Data and time of the execution
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Create Overall Statistics DataFrame
    overall_stats_df = pd.DataFrame(
        [
            {
                "Total Tokens": total_tokens,
                "Number of Samples": num_samples,
                "Average Elapsed Time (s)": average_elapsed_time,
                "Estimated Cost (USD)": estimated_cost,
                "Average Percentage of Context Length (%)": average_percentage_context_length,
                "Min Created Timestamp": min_created,
                "Max Created Timestamp": max_created,
                "origin": filename,
                "run_at": now,
            }
        ]
    )

    # Statistics by Sample Type (doc_type)
    stats_by_doc_type = (
        data.groupby("doc_type")
        .agg(
            total_tokens=("total_tokens", "sum"),
            num_samples=("doc_type", "count"),
            average_elapsed_time=("elapsed_time", "mean"),
            average_tokens=("total_tokens", "mean"),
            estimated_cost=(
                "total_tokens",
                lambda x: (x.sum() / 1_000_000)
                * data.loc[x.index, "price_per_million_tokens"].mean(),
            ),
            average_percentage_context_length=(
                "total_tokens",
                lambda x: (
                    x.mean() / data.loc[x.index, "reference_context_length"].mean()
                )
                * 100,
            ),
        )
        .reset_index()
    )
    stats_by_doc_type["filename"] = filename
    stats_by_doc_type["run_at"] = now

    # Statistics by Model
    stats_by_model = (
        data.groupby("model")
        .agg(
            total_tokens=("total_tokens", "sum"),
            num_samples=("model", "count"),
            average_elapsed_time=("elapsed_time", "mean"),
            average_tokens=("total_tokens", "mean"),
            average_percentage_context_length=(
                "total_tokens",
                lambda x: (x.mean() / get_reference_model_context_length(x.name)) * 100,
            ),
        )
        .reset_index()
    )
    stats_by_model["filename"] = filename
    stats_by_model["run_at"] = now

    # Add estimated cost and cost columns separately since they require different calculations
    def calculate_group_cost(model):
        price = get_price_per_million_tokens(model)
        total_tokens = data[data["model"] == model]["total_tokens"].sum()
        return (total_tokens / 1_000_000) * price

    stats_by_model["estimated_cost"] = stats_by_model["model"].apply(
        calculate_group_cost
    )
    stats_by_model["cost"] = stats_by_model["estimated_cost"]

    # Calculate Tokens per Second
    # Ensure there are no division by zero issues by filtering out zero elapsed times
    data = data[data["elapsed_time"] > 0]
    data["tokens_per_second"] = data["total_tokens"] / data["elapsed_time"]

    # Write the statistics to an Excel file
    file_name = os.path.join(output_dir, "prompt-analysis.xlsx")

    with pd.ExcelWriter(file_name, engine="openpyxl") as writer:
        # Replace the data on each sheet with the new data
        overall_stats_df.to_excel(writer, sheet_name="Overall Statistics", index=False)
        stats_by_doc_type.to_excel(
            writer, sheet_name="Statistics by Sample Type", index=False
        )
        stats_by_model.to_excel(writer, sheet_name="Statistics by Model", index=False)
        additional_stats_df = pd.DataFrame(
            [
                {
                    "Average Completion Tokens": data["completion_tokens"].mean(),
                    "Average Prompt Tokens": data["prompt_tokens"].mean(),
                    "Average Total Tokens per Sample": data["total_tokens"].mean(),
                    "Total Elapsed Time (s)": data["elapsed_time"].sum(),
                    "Average Tokens per Second": data["tokens_per_second"].mean(),
                    "origin": filename,
                    "run_at": now,
                }
            ]
        )
        additional_stats_df.to_excel(
            writer, sheet_name="Additional Statistics", index=False
        )
        data.to_excel(writer, sheet_name="Raw Data", index=False)

        # Explanation Page
        explanation_data = {
            "Sheet Name": [
                "Overall Statistics",
                "Statistics by Sample Type",
                "Statistics by Model",
                "Additional Statistics",
                "Raw Data",
            ],
            "Description": [
                "Summary statistics of the entire dataset, including total tokens, number of samples, average elapsed time, and estimated cost.",
                "Statistics broken down by sample type (doc_type), including the total number of tokens and cost estimates for each type.",
                "Statistics grouped by the model used, showing token utilization, cost, and elapsed time for each model.",
                "Additional aggregated metrics such as average completion tokens, prompt tokens, total tokens per sample, and processing time.",
                "The raw data used for generating all the statistics, including individual completions and their details.",
            ],
            "Columns Explained": [
                "Total Tokens: Total number of tokens processed. Number of Samples: Total number of samples. Average Elapsed Time (s): Average time taken for processing. Estimated Cost (USD): Estimated cost for token usage. Average Percentage of Context Length (%): Average percentage of used context length. Min and Max Created Timestamp: The time range of the data collected. Origin: Source filename.",
                "doc_type: Type of document. total_tokens: Sum of tokens per document type. num_samples: Number of samples of this type. average_elapsed_time: Average time taken per document type. average_tokens: Average tokens per sample. estimated_cost: Estimated cost for tokens of this type. average_percentage_context_length: Average percentage of context length used. filename: Source filename.",
                "model: Model name. total_tokens: Total number of tokens used by the model. num_samples: Number of samples processed by the model. average_elapsed_time: Average processing time for the model. average_tokens: Average number of tokens per sample. average_percentage_context_length: Average context length percentage used. filename: Source filename. estimated_cost/cost: Cost for the tokens used by the model.",
                "Average Completion Tokens: Average number of completion tokens per sample. Average Prompt Tokens: Average number of prompt tokens per sample. Average Total Tokens per Sample: Average number of total tokens per sample. Total Elapsed Time (s): Total processing time for all samples. Average Tokens per Second: Average number of tokens processed per second. origin: Source filename.",
                "filename: Source filename. doc_type: Type of document. elapsed_time: Time taken for each document. usage: Token usage details (completion and prompt). created: Timestamp of creation. model: Model used.",
            ],
        }
        explanation_df = pd.DataFrame(explanation_data)
        explanation_df.to_excel(writer, sheet_name="Explanation", index=False)

    # Display Overall Statistics
    overall_stats_df_display = pd.DataFrame(
        [
            {
                "Total Tokens": total_tokens,
                "Number of Samples": num_samples,
                "Average Elapsed Time (s)": average_elapsed_time,
                "Estimated Cost (USD)": estimated_cost,
                "Average Percentage of Context Length (%)": average_percentage_context_length,
                "Min Created Timestamp": min_created,
                "Max Created Timestamp": max_created,
                "origin": filename,
                "run_at": now,
            }
        ]
    )
    print("\nOverall Statistics:")
    print(overall_stats_df_display.to_string(index=False))

    # Display Statistics by Sample Type
    print("\nStatistics by Sample Type (doc_type):")
    print(stats_by_doc_type.to_string(index=False))

    # Display Statistics by Model
    print("\nStatistics by Model:")
    print(stats_by_model.to_string(index=False))

    # Additional Statistics
    additional_stats_df_display = pd.DataFrame(
        [
            {
                "Average Completion Tokens": data["completion_tokens"].mean(),
                "Average Prompt Tokens": data["prompt_tokens"].mean(),
                "Average Total Tokens per Sample": data["total_tokens"].mean(),
                "Total Elapsed Time (s)": data["elapsed_time"].sum(),
                "Average Tokens per Second": data["tokens_per_second"].mean(),
                "origin": filename,
                "run_at": now,
            }
        ]
    )
    print("\nAdditional Statistics:")
    print(additional_stats_df_display.to_string(index=False))
In [8]:
# Add similarity_classification based on similarity_score
def classify_similarity(score):
    if score == 1.0:
        return "identical"
    elif score >= 0.9:
        return "close-match"
    else:
        return "not-sure"


# Modify the highlight_similarity function to use three colors
def highlight_similarity(val):
    if val == "identical":
        color = "green"
    elif val == "close-match":
        color = "yellow"
    else:
        color = "red"
    return f"background-color: {color}"
In [9]:
def create_df_elements_results(similarity_elements_results):
    # Build the dataframe
    df_results = pd.DataFrame(similarity_elements_results)

    df_results["similarity_classification"] = df_results["similarity_score"].apply(
        classify_similarity
    )

    df_results["classification_match"] = (
        df_results["classification_pred"] == df_results["classification_true"]
    )
    df_results["classification_match_label"] = df_results["classification_match"].map(
        {True: "match", False: "mismatch"}
    )

    df_results["source_match"] = df_results["source_pred"] == df_results["source_true"]
    df_results["source_match_label"] = df_results["source_match"].map(
        {True: "match", False: "mismatch"}
    )

    df_results["id_match"] = df_results["id_pred"] == df_results["id_true"]
    df_results["id_match_label"] = df_results["id_match"].map(
        {True: "match", False: "mismatch"}
    )

    return df_results
In [10]:
class JudgeStatement(BaseModel):
    doc_id: str = Field(..., description="Document ID associated with the statement.")
    statement_id: str = Field(
        ...,
        description="A provided string that identifies the statement. e.g., '1', 'Person'.",
    )
    statement: str = Field(..., description="The statement to be transformed.")
    sources: List[str] = Field(..., description="Sources of the statement.")
    semscore: float = Field(..., description="just a copy from input semscore.")
    similarity_score: float = Field(
        ...,
        description="Similarity score between the original and transformed sentences.",
    )
    similarity_score_confidence: float = Field(
        ..., description="Confidence score for the similarity score."
    )
    transformation_accuracy: float = Field(
        ..., description="Accuracy score for the transformation."
    )
    grammar_syntax_accuracy: float = Field(
        ..., description="Accuracy score for the grammar and syntax."
    )
    findings: List[str] = Field(..., description="List of findings.")


class JudgeStatements(BaseModel):
    JudgeStatements: List[JudgeStatement] = Field(
        ..., description="List of judge statements."
    )
In [11]:
def get_prompts_for_judge(rules, data_dir):
    rule_template_provider = RulesTemplateProvider(data_dir)

    system_prompts = []
    user_prompts = []

    for rule in rules:
        element_name = rule.get("element_name")

        if element_name == ["Term", "Name"]:
            statement_key = "definition"
            statement_id_key = "signifier"
        else:
            statement_key = "statement"
            statement_id_key = "statement_id"

        user_prompt = get_user_prompt_judge_sentence_similarity(element_name, rule)
        user_prompts.append(user_prompt)
        rule_templates_subtemplates = rule_template_provider.get_rules_template(
            rule["templates_ids"]
        )
        system_prompt = get_system_prompt_judge_sentence_similarity(
            rule_templates_subtemplates
        )
        system_prompts.append(system_prompt)
        logger.debug(system_prompt)
        logger.debug(user_prompt)

    logger.info(f"System prompts for {element_name}s: {len(system_prompts)}")
    logger.info(f"User prompts for {element_name}s: {len(user_prompts)}")

    return system_prompts, user_prompts, element_name
In [12]:
def evaluate_statement(element_name, user_prompts, system_prompts, manager):
    # Initialize an empty list to accumulate all responses
    all_responses = []
    elapse_times = []
    completions = []

    # Loop through each pair of user and system prompts with a counter
    for index, (user_prompt, system_prompt) in enumerate(
        zip(user_prompts, system_prompts), start=1
    ):
        logger.info(f"Processing evaluation prompt {index} for {element_name}.")
        logger.debug(system_prompt)
        logger.debug(user_prompt)

        # Query the language model
        response, completion, elapse_time = query_instruct_llm(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            document_model=JudgeStatements,
            llm_model=config["LLM"]["MODEL"],
            temperature=config["LLM"]["TEMPERATURE"],
            max_tokens=config["LLM"]["MAX_TOKENS"],
        )

        logger.debug(f"{response}")

        # Accumulate the responses in the list
        all_responses.extend(response.JudgeStatements)
        elapse_times.append(elapse_time)
        completions.append(completion.dict())

        logger.info(f"Finished processing evaluation {index}.")

        logger.info("Waiting 2s before processing the next prompt to avoid rate limits")
        time.sleep(2)

    # After the loop, create a single Document with all the accumulated responses
    doc = Document(
        id=f"validation_judge_{element_name.replace(' ', '_')}s",
        type="llm_validation",
        content=all_responses,
        elapsed_times=elapse_times,
        completions=completions,
    )
    manager.add_document(doc)

    logger.info(f"{element_name}s: {len(all_responses)}")

    return all_responses
In [13]:
def get_embedding(text, model="text-embedding-3-large"):
    client = OpenAI()
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding


def cosine_similarity(embedding1, embedding2):
    # Calcula a similaridade de cosseno entre dois embeddings
    embedding1 = np.array(embedding1)
    embedding2 = np.array(embedding2)
    cos_sim = np.dot(embedding1, embedding2) / (
        np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    )
    return cos_sim


def compare_sentences(sentence1, sentence2):
    # Obtem embeddings para as duas frases
    embedding1 = get_embedding(sentence1)
    embedding2 = get_embedding(sentence2)
    # Calcula a similaridade de cosseno entre os embeddings
    # similarity = cosine_similarity(embedding1, embedding2)
    similarity = 1 - cosine(embedding1, embedding2)
    return similarity
In [14]:
# Function to calculate Intraclass Correlation Coefficient (ICC) using statsmodels
def calculate_icc(data):
    """
    Calculate the Intraclass Correlation Coefficient (ICC) using statsmodels.
    data: pandas DataFrame with 'semscore' and 'similarity_score' columns.
    """
    data = data[["semscore", "similarity_score"]].dropna().reset_index(drop=True)
    data["subject"] = data.index
    ratings = data.melt(id_vars=["subject"], var_name="rater", value_name="score")
    model = sm.MixedLM.from_formula(
        "score ~ 1", groups="subject", re_formula="1", data=ratings
    )
    result = model.fit()
    var_components = result.cov_re.iloc[0, 0]
    residual = result.scale
    icc_value = var_components / (var_components + residual)
    return icc_value


# Function to calculate descriptive statistics
def calculate_descriptive_stats(series):
    stats = series.describe()
    stats["range"] = stats["max"] - stats["min"]
    stats["IQR"] = stats["75%"] - stats["25%"]
    return stats


# Function to plot Bland-Altman plot
def plot_bland_altman(df, score1, score2, title, output_dir, filename):
    mean_scores = (df[score1] + df[score2]) / 2
    diff_scores = df[score1] - df[score2]
    diff_mean = diff_scores.mean()
    diff_std = diff_scores.std()
    loa_upper = diff_mean + 1.96 * diff_std
    loa_lower = diff_mean - 1.96 * diff_std

    plt.figure(figsize=(8, 6))
    plt.scatter(mean_scores, diff_scores, alpha=0.7)
    plt.axhline(diff_mean, color="red", linestyle="--")
    plt.axhline(loa_upper, color="grey", linestyle="--")
    plt.axhline(loa_lower, color="grey", linestyle="--")
    plt.xlabel("Mean of Scores")
    plt.ylabel("Difference of Scores")
    plt.title(title)
    plt.grid(True)
    plt.show()

    # Save plot
    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.close()
    return plot_path


# Function to plot scatter plot
def plot_scatter(df, x_col, y_col, title, xlabel, ylabel, output_dir, filename):
    plt.figure(figsize=(8, 6))
    plt.scatter(df[x_col], df[y_col], alpha=0.7)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(True)
    plt.show()

    # Save plot
    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.close()
    return plot_path


# Function to plot histogram
def plot_histogram(series, title, xlabel, output_dir, filename):
    plt.figure(figsize=(8, 6))
    series.hist(bins=20)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()

    # Save plot
    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.close()
    return plot_path


# Function to plot box plot
def plot_boxplot(series, title, ylabel, output_dir, filename):
    plt.figure(figsize=(8, 6))
    series.plot.box()
    plt.title(title)
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.show()

    # Save plot
    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.close()
    return plot_path


# Function to plot density plot
def plot_density(series, title, xlabel, output_dir, filename):
    plt.figure(figsize=(8, 6))
    series.plot(kind="kde")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.grid(True)
    plt.show()

    # Save plot
    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.close()
    return plot_path


# Function to count scores above a threshold
def count_scores_above_threshold(series, threshold):
    count = (series > threshold).sum()
    return count


# Function to plot heatmap
def plot_heatmap(df, title, output_dir, filename):
    plt.figure(figsize=(10, 8))
    sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
    plt.title(title)
    plt.show()

    # Save plot
    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path, bbox_inches="tight")
    plt.close()
    return plot_path

# Function to plot Q-Q plot
def plot_qqplot(series, title, output_dir, filename):
    plt.figure(figsize=(8, 6))
    
    # Gerar o Q-Q plot
    stats.probplot(series, dist="norm", plot=plt)
    plt.title(title)
    plt.grid(True)
    
    # Exibir o gráfico
    plt.show()

    # Salvar o gráfico
    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.close()
    return plot_path


# Main function to process all elements
def process_all_elements(element_data, output_dir):
    logger.info(f"Processing All Elements\n{'-'*40}")

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Define the path for the Excel file
    excel_file_path = os.path.join(output_dir, "combined_analysis_results.xlsx")

    # Create an Excel writer using XlsxWriter as the engine
    writer = pd.ExcelWriter(excel_file_path, engine="xlsxwriter")
    workbook = writer.book

    # List to store DataFrames for combining data later
    combined_df_list = []

    # List to keep track of image filenames for cleanup
    image_files = []

    # Loop through each element type in element_data
    for element_name, content in element_data.items():
        element_type = element_name  # e.g., 'Operative_Rules'
        logger.info(f"\nProcessing Element Type: {element_type}")

        df = pd.DataFrame(content)

        # Ensure numeric columns are of float type
        numeric_cols = [
            "semscore",
            "similarity_score",
            "similarity_score_confidence",
            "transformation_accuracy",
            "grammar_syntax_accuracy",
        ]
        df[numeric_cols] = df[numeric_cols].astype(float)

        # Add a column for element type
        df["element_type"] = element_type

        # Append to combined data list
        combined_df_list.append(df)

        # Perform analysis on this element type
        sheet_name = element_type[:31]  # Sheet names have a max length of 31 characters
        worksheet = workbook.add_worksheet(sheet_name)
        writer.sheets[sheet_name] = worksheet

        # Reset row counter for each sheet
        row = 0

        # Descriptive Statistics
        semscore_stats = calculate_descriptive_stats(df["semscore"])
        similarity_score_stats = calculate_descriptive_stats(df["similarity_score"])

        # Correlations
        pearson_corr = df["semscore"].corr(df["similarity_score"])
        spearman_corr = df["semscore"].corr(df["similarity_score"], method="spearman")

        # Display statistics in the notebook
        print("Semscore Statistics:")
        display(semscore_stats.to_frame())
        print("\nSimilarity Score Statistics:")
        display(similarity_score_stats.to_frame())
        print(
            f"\nPearson Correlation between semscore and similarity_score: {pearson_corr:.4f}"
        )
        print(
            f"Spearman Correlation between semscore and similarity_score: {spearman_corr:.4f}"
        )

        # Write Semscore Statistics to Excel
        semscore_stats_df = semscore_stats.to_frame(name="Semscore Statistics")
        semscore_stats_df.to_excel(
            writer, sheet_name=sheet_name, startrow=row, startcol=0
        )
        row += len(semscore_stats_df) + 3  # Increment row for next section

        # Write Similarity Score Statistics to Excel
        similarity_score_stats_df = similarity_score_stats.to_frame(
            name="Similarity Score Statistics"
        )
        similarity_score_stats_df.to_excel(
            writer, sheet_name=sheet_name, startrow=row, startcol=0
        )
        row += len(similarity_score_stats_df) + 3

        # Write Correlations to Excel
        worksheet.write(
            row, 0, "Pearson Correlation between semscore and similarity_score"
        )
        worksheet.write(row, 1, pearson_corr)
        row += 1
        worksheet.write(
            row, 0, "Spearman Correlation between semscore and similarity_score"
        )
        worksheet.write(row, 1, spearman_corr)
        row += 3

        # Intraclass Correlation Coefficient (ICC)
        icc_value = calculate_icc(df[["semscore", "similarity_score"]])
        print(
            f"\nIntraclass Correlation Coefficient (ICC) between semscore and similarity_score: {icc_value:.4f}"
        )
        worksheet.write(row, 0, "Intraclass Correlation Coefficient (ICC)")
        worksheet.write(row, 1, icc_value)
        row += 3

        # Q-Q Plot for similarity_score
        plot_filename = f"qqplot_similarity_score_{element_type}.png"
        plot_path = plot_qqplot(
            df["similarity_score"],
            f"Q-Q Plot of similarity_score - {element_type}",
            output_dir,
            plot_filename,
        )

        # Bland-Altman Plot
        plot_filename = f"bland_altman_{element_type}.png"
        plot_path = plot_bland_altman(
            df,
            "semscore",
            "similarity_score",
            f"Bland-Altman Plot - {element_type}",
            output_dir,
            plot_filename,
        )
        worksheet.insert_image(row, 0, plot_path)
        row += 20
        image_files.append(plot_path)

        # Scatter Plot
        plot_filename = f"scatter_semscore_similarity_{element_type}.png"
        plot_path = plot_scatter(
            df,
            "semscore",
            "similarity_score",
            f"Semscore vs Similarity Score - {element_type}",
            "Semscore",
            "Similarity Score",
            output_dir,
            plot_filename,
        )
        worksheet.insert_image(row, 0, plot_path)
        row += 20
        image_files.append(plot_path)

        # Histograms for semscore
        plot_filename = f"histogram_semscore_{element_type}.png"
        plot_path = plot_histogram(
            df["semscore"],
            f"Histogram of Semscore - {element_type}",
            "Semscore",
            output_dir,
            plot_filename,
        )
        worksheet.insert_image(row, 0, plot_path)
        row += 20
        image_files.append(plot_path)

        # Box Plot for semscore
        plot_filename = f"boxplot_semscore_{element_type}.png"
        plot_path = plot_boxplot(
            df["semscore"],
            f"Box Plot of Semscore - {element_type}",
            "Semscore",
            output_dir,
            plot_filename,
        )
        worksheet.insert_image(row, 0, plot_path)
        row += 20
        image_files.append(plot_path)

        # Box Plot for similarity_score
        plot_filename = f"boxplot_similarity_score_{element_type}.png"
        plot_path = plot_boxplot(
            df["similarity_score"],
            f"Box Plot of similarity_score - {element_type}",
            "similarity_score",
            output_dir,
            plot_filename,
        )
        worksheet.insert_image(row, 0, plot_path)
        row += 20
        image_files.append(plot_path)

        # Density Plot for semscore
        plot_filename = f"density_semscore_{element_type}.png"
        plot_path = plot_density(
            df["semscore"],
            f"Density Plot of Semscore - {element_type}",
            "Semscore",
            output_dir,
            plot_filename,
        )
        worksheet.insert_image(row, 0, plot_path)
        row += 20
        image_files.append(plot_path)

        # Density Plot for similarity_score
        plot_filename = f"density_similarity_score_{element_type}.png"
        plot_path = plot_density(
            df["similarity_score"],
            f"Density Plot of similarity_score - {element_type}",
            "similarity_score",
            output_dir,
            plot_filename,
        )
        worksheet.insert_image(row, 0, plot_path)
        row += 20
        image_files.append(plot_path)

        # Counts of Scores Above Threshold
        threshold = 0.8
        count_above_threshold = count_scores_above_threshold(
            df["transformation_accuracy"], threshold
        )
        print(
            f"\nCount of Transformation Accuracy scores above {threshold}: {count_above_threshold}"
        )
        worksheet.write(row, 0, f"Count of Transformation Accuracy > {threshold}")
        worksheet.write(row, 1, count_above_threshold)
        row += 3

        # Heatmap of Correlation Matrix
        plot_filename = f"heatmap_{element_type}.png"
        plot_path = plot_heatmap(
            df[numeric_cols],
            f"Correlation Matrix Heatmap - {element_type}",
            output_dir,
            plot_filename,
        )
        worksheet.insert_image(row, 0, plot_path)
        row += 25
        image_files.append(plot_path)

        # Save the DataFrame with the original data to a separate sheet
        data_sheet_name = f"{sheet_name}_Data"[:31]
        df.to_excel(writer, sheet_name=data_sheet_name, index=False)

    # Combine all DataFrames
    combined_df = pd.concat(combined_df_list, ignore_index=True)

    # Perform combined analysis
    print("\nProcessing Combined Data")
    sheet_name = "Combined_Analysis"
    worksheet = workbook.add_worksheet(sheet_name)
    writer.sheets[sheet_name] = worksheet
    row = 0

    # Ensure numeric columns are of float type
    combined_df[numeric_cols] = combined_df[numeric_cols].astype(float)

    # Descriptive Statistics for combined semscore
    semscore_stats = calculate_descriptive_stats(combined_df["semscore"])
    similarity_score_stats = calculate_descriptive_stats(
        combined_df["similarity_score"]
    )

    # Correlations
    pearson_corr = combined_df["semscore"].corr(combined_df["similarity_score"])
    spearman_corr = combined_df["semscore"].corr(
        combined_df["similarity_score"], method="spearman"
    )

    # Display statistics in the notebook
    print("Combined Semscore Statistics:")
    display(semscore_stats.to_frame())
    print("\nCombined Similarity Score Statistics:")
    display(similarity_score_stats.to_frame())
    print(
        f"\nCombined Pearson Correlation between semscore and similarity_score: {pearson_corr:.4f}"
    )
    print(
        f"Combined Spearman Correlation between semscore and similarity_score: {spearman_corr:.4f}"
    )

    # Write Semscore Statistics to Excel
    semscore_stats_df = semscore_stats.to_frame(name="Combined Semscore Statistics")
    semscore_stats_df.to_excel(writer, sheet_name=sheet_name, startrow=row, startcol=0)
    row += len(semscore_stats_df) + 3

    # Write Similarity Score Statistics to Excel
    similarity_score_stats_df = similarity_score_stats.to_frame(
        name="Combined Similarity Score Statistics"
    )
    similarity_score_stats_df.to_excel(
        writer, sheet_name=sheet_name, startrow=row, startcol=0
    )
    row += len(similarity_score_stats_df) + 3

    # Write Correlations to Excel
    worksheet.write(
        row, 0, "Combined Pearson Correlation between semscore and similarity_score"
    )
    worksheet.write(row, 1, pearson_corr)
    row += 1
    worksheet.write(
        row, 0, "Combined Spearman Correlation between semscore and similarity_score"
    )
    worksheet.write(row, 1, spearman_corr)
    row += 3

    # Intraclass Correlation Coefficient (ICC) for Combined Data
    icc_value = calculate_icc(combined_df[["semscore", "similarity_score"]])
    print(
        f"\nCombined Intraclass Correlation Coefficient (ICC) between semscore and similarity_score: {icc_value:.4f}"
    )
    worksheet.write(row, 0, "Combined Intraclass Correlation Coefficient (ICC)")
    worksheet.write(row, 1, icc_value)
    row += 3

    # Bland-Altman Plot for Combined Data
    plot_filename = "combined_bland_altman.png"
    plot_path = plot_bland_altman(
        combined_df,
        "semscore",
        "similarity_score",
        "Bland-Altman Plot - Combined Data",
        output_dir,
        plot_filename,
    )
    worksheet.insert_image(row, 0, plot_path)
    row += 20
    image_files.append(plot_path)

    # Scatter Plot for Combined Data
    plot_filename = "scatter_semscore_similarity_combined.png"
    plot_path = plot_scatter(
        combined_df,
        "semscore",
        "similarity_score",
        "Semscore vs Similarity Score - Combined Data",
        "Semscore",
        "Similarity Score",
        output_dir,
        plot_filename,
    )
    worksheet.insert_image(row, 0, plot_path)
    row += 20
    image_files.append(plot_path)

    # Histograms for combined semscore
    plot_filename = "histogram_semscore_combined.png"
    plot_path = plot_histogram(
        combined_df["semscore"],
        "Histogram of Semscore - Combined Data",
        "Semscore",
        output_dir,
        plot_filename,
    )
    worksheet.insert_image(row, 0, plot_path)
    row += 20
    image_files.append(plot_path)

    # Box Plot for combined semscore
    plot_filename = "boxplot_semscore_combined.png"
    plot_path = plot_boxplot(
        combined_df["semscore"],
        "Box Plot of Semscore - Combined Data",
        "Semscore",
        output_dir,
        plot_filename,
    )
    worksheet.insert_image(row, 0, plot_path)
    row += 20
    image_files.append(plot_path)

    # Density Plot for combined semscore
    plot_filename = "density_semscore_combined.png"
    plot_path = plot_density(
        combined_df["semscore"],
        "Density Plot of Semscore - Combined Data",
        "Semscore",
        output_dir,
        plot_filename,
    )
    worksheet.insert_image(row, 0, plot_path)
    row += 20
    image_files.append(plot_path)

# Histograms for combined similarity_score
    plot_filename = "histogram_similarity_score_combined.png"
    plot_path = plot_histogram(
        combined_df["similarity_score"],
        "Histogram of similarity_score - Combined Data",
        "similarity_score",
        output_dir,
        plot_filename,
    )
    worksheet.insert_image(row, 0, plot_path)
    row += 20
    image_files.append(plot_path)

    # Box Plot for combined similarity_score
    plot_filename = "boxplot_similarity_score_combined.png"
    plot_path = plot_boxplot(
        combined_df["similarity_score"],
        "Box Plot of similarity_score - Combined Data",
        "similarity_score",
        output_dir,
        plot_filename,
    )
    worksheet.insert_image(row, 0, plot_path)
    row += 20
    image_files.append(plot_path)

    # Density Plot for combined similarity_score
    plot_filename = "density_similarity_score_combined.png"
    plot_path = plot_density(
        combined_df["similarity_score"],
        "Density Plot of similarity_score - Combined Data",
        "similarity_score",
        output_dir,
        plot_filename,
    )
    worksheet.insert_image(row, 0, plot_path)
    row += 20
    image_files.append(plot_path)

    # Counts of Scores Above Threshold in Combined Data
    threshold = 0.8
    count_above_threshold = count_scores_above_threshold(
        combined_df["transformation_accuracy"], threshold
    )
    print(
        f"\nCombined Count of Transformation Accuracy scores above {threshold}: {count_above_threshold}"
    )
    worksheet.write(row, 0, f"Combined Count of Transformation Accuracy > {threshold}")
    worksheet.write(row, 1, count_above_threshold)
    row += 3

    # Heatmap of Correlation Matrix for Combined Data
    plot_filename = "heatmap_combined.png"
    plot_path = plot_heatmap(
        combined_df[numeric_cols],
        "Correlation Matrix Heatmap - Combined Data",
        output_dir,
        plot_filename,
    )
    worksheet.insert_image(row, 0, plot_path)
    row += 25
    image_files.append(plot_path)

    # Save the combined DataFrame to a separate sheet
    combined_df.to_excel(writer, sheet_name="Combined_Data", index=False)

    # Close the writer and save the Excel file
    writer.close()
    print(f"Analysis saved to '{excel_file_path}'")

    # Clean up the plot images after saving the workbook
    for image_file in image_files:
        if os.path.exists(image_file):
            os.remove(image_file)

    return combined_df

Datasets¶

From section 7.2.4 Datasets

The dataset of the previous algorithm was adjusted with the gold standard dataset. The goal is to reduce the accumulation of errors from one step to the next.

The data adjusted:

  • § 275.0-2_P1, § 275.0-2_P2
  • § 275.0-5_P1, § 275.0-5_P2
  • § 275.0-7_P1, § 275.0-7_P2

True tables¶

There are no true tables to evaluate the transformation, the evaluation depends on the algorithms SEMSCORE and "LLM as a Judge".

Predicted values¶

Get predicted elements from all runs

In [15]:
(
    pred_operative_rules_classify,
    pred_facts_classify,
    pred_terms_classify,
    pred_names_classify,
    pred_files_classify,
) = get_elements_from_checkpoints(
    config["DEFAULT_CHECKPOINT_DIR"], merge=True, filter="non_null"
)
2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-1.json
2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-10.json
2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-2.json
2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-3.json
2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-4.json
2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-5.json
2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-6.json
2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-7.json
2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-8.json
2024-12-14 13:12:24 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-9.json
2024-12-14 13:12:24 - INFO - Rules to evaluate: 60
2024-12-14 13:12:24 - INFO - Facts to evaluate: 160
2024-12-14 13:12:24 - INFO - Terms to evaluate: 280
2024-12-14 13:12:24 - INFO - Names to evaluate: 50
[{'filename': 'documents-2024-12-08-1.json', 'date': '2024-12-08', 'number': 1}, {'filename': 'documents-2024-12-08-10.json', 'date': '2024-12-08', 'number': 10}, {'filename': 'documents-2024-12-08-2.json', 'date': '2024-12-08', 'number': 2}, {'filename': 'documents-2024-12-08-3.json', 'date': '2024-12-08', 'number': 3}, {'filename': 'documents-2024-12-08-4.json', 'date': '2024-12-08', 'number': 4}, {'filename': 'documents-2024-12-08-5.json', 'date': '2024-12-08', 'number': 5}, {'filename': 'documents-2024-12-08-6.json', 'date': '2024-12-08', 'number': 6}, {'filename': 'documents-2024-12-08-7.json', 'date': '2024-12-08', 'number': 7}, {'filename': 'documents-2024-12-08-8.json', 'date': '2024-12-08', 'number': 8}, {'filename': 'documents-2024-12-08-9.json', 'date': '2024-12-08', 'number': 9}]

Set dataset to evaluation and check empty transformed elements

In [16]:
data = (
    pred_facts_classify,
    pred_terms_classify,
    pred_names_classify,
    pred_operative_rules_classify,
)
data_names = ("pred_facts", "pred_terms", "pred_names", "pred_operative_rules")

for element_list, element_name in zip(data, data_names):
    empty_transformed_elements = [
        element for element in element_list if not element.get("transformed")
    ]
    logger.info(
        f"Empty transformed {element_name}: {len(empty_transformed_elements)}/{len(element_list)}"
    )
2024-12-14 13:12:24 - INFO - Empty transformed pred_facts: 0/160
2024-12-14 13:12:24 - INFO - Empty transformed pred_terms: 0/280
2024-12-14 13:12:24 - INFO - Empty transformed pred_names: 0/50
2024-12-14 13:12:24 - INFO - Empty transformed pred_operative_rules: 0/60
2024-12-14 13:12:24 - INFO - Empty transformed pred_terms: 0/280
2024-12-14 13:12:24 - INFO - Empty transformed pred_names: 0/50
2024-12-14 13:12:24 - INFO - Empty transformed pred_operative_rules: 0/60
In [17]:
for element_list, element_name in zip(data, data_names):
    element_df = pd.DataFrame(element_list)
    mi.matrix(element_df, figsize=(10, 5))
    plt.title(f"Missing Values for {element_name}")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Algorithms¶

Validation of algorithm from section 6.2 Implementation of main components

Source for section 7.3 Results

nlp2sbvr¶

Elements measurements from chapter 7.2.3 Terms, names, facts, and operative rules

Measuring similarity with SEMSCORE¶

Evaluating SEMSCORE (AYNETDINOV;AKBIK, 2024) between the predicted and true statements for each element.

WARNING: Expensive operation!

If the data is available could skip processing evaluation. Operation is expensive, if just need to compile the evaluation, set SKIP to True.

In [18]:
SKIP = True
In [19]:
if not SKIP:
    for element_list, data_name in zip(data, data_names):
        for element in element_list:
            # Check if 'semscore' is already calculated
            if "semscore" not in element or element["semscore"] is None:
                original_sentence = element.get("statement", element.get("definition"))
                transformed_sentence = element.get("transformed")
                templates_ids = element.get("templates_ids")
                element_name = element.get("element_name")

                # Remove keys if they exist
                for key in [
                    "explanation",
                    "confidence",
                    "subtype_confidence",
                    "subtype_explanation",
                ]:
                    element.pop(key, None)  # Using pop with None to avoid KeyError

                logger.debug(f"{element_name=}")
                logger.debug(
                    f"{data_name} - {element['statement_id']}: {element['doc_id']}{element['sources']}\nOriginal Sentence: {original_sentence}\nTransformed Sentence: {transformed_sentence}\ntemplates: {templates_ids}\n"
                )
                logger.debug(f"{element=}")

                # Calculate similarity score
                similarity = compare_sentences(original_sentence, transformed_sentence)
                logger.info(
                    f"element: {element_name}, similarity score: {similarity}\n"
                )

                # Assign the calculated score to 'semscore'
                element["semscore"] = similarity
                logger.debug(f"element: {element}\n")
            else:
                logger.debug(
                    f"{element.get('element_name')} already has a semscore: {element['semscore']}"
                )

Check if SEMSCORE was calculated.

In [20]:
# Check for 'semscore' in each list of dictionaries
semscore_in_operative_rules = all(
    "semscore" in item and item["semscore"] is not None
    for item in pred_operative_rules_classify
)
semscore_in_facts = all(
    "semscore" in item and item["semscore"] is not None for item in pred_facts_classify
)
semscore_in_terms = all(
    "semscore" in item and item["semscore"] is not None for item in pred_terms_classify
)
semscore_in_names = all(
    "semscore" in item and item["semscore"] is not None for item in pred_names_classify
)

# Log information including whether 'semscore' exists and is not None
logger.info(
    f"Operative Rules to evaluate: {len(pred_operative_rules_classify)}, semscore was calculated: {semscore_in_operative_rules}"
)
logger.info(
    f"Facts to evaluate: {len(pred_facts_classify)}, semscore was calculated: {semscore_in_facts}"
)
logger.info(
    f"Terms to evaluate: {len(pred_terms_classify)}, semscore was calculated: {semscore_in_terms}"
)
logger.info(
    f"Names to evaluate: {len(pred_names_classify)}, semscore was calculated: {semscore_in_names}"
)
2024-12-14 13:12:26 - INFO - Operative Rules to evaluate: 60, semscore was calculated: True
2024-12-14 13:12:26 - INFO - Facts to evaluate: 160, semscore was calculated: True
2024-12-14 13:12:26 - INFO - Terms to evaluate: 280, semscore was calculated: True
2024-12-14 13:12:26 - INFO - Names to evaluate: 50, semscore was calculated: True
2024-12-14 13:12:26 - INFO - Facts to evaluate: 160, semscore was calculated: True
2024-12-14 13:12:26 - INFO - Terms to evaluate: 280, semscore was calculated: True
2024-12-14 13:12:26 - INFO - Names to evaluate: 50, semscore was calculated: True

Evaluation criterias (SHANKAR et al., 2024)¶

Based on the prompt, there are three inferred evaluation criteria that align with the approach proposed by EvalGen (SHANKAR et al., 2024):

  1. Similarity Score

    • Given the original_sentence and tranformed_sentence, how similar are they from 0 to 1? And how confident are you about your estimation from 0 to 1?
  2. Transformation Accuracy

    • From 0 to 1, how does the "transformed_sentence" reflect the original_sentence with the structure and phrasing provided by the template?
  3. Grammar and Syntax Accuracy

    • How is the transformed sentence grammatically correct and syntactically accurate from 0 to 1?

LLM-as-a-judge¶

References of the LLM-as-a-judge approach: (WEI; CHEN; LUO, 2024), (DONG; HU; COLLIER, 2024), (ZHENG et al., 2023)

Prompt engineering¶

System prompt

In [21]:
def get_system_prompt_judge_sentence_similarity(template):
    return f"""
   # Task

   You're an expert in judging sentence similarity and transformation using a template. 

   These criteria should support the evaluation process by verifying classification accuracy, template application, and transformation fidelity.

   Check the criteria and evaluate the output:

   1. **Similarity Score**
      - Given the statement or definition and tranformed sentence (transformed), how similar are they from 0 to 1? And how confident are you about your estimation from 0 to 1?

   2. **Transformation Accuracy**
      - From 0 to 1, how does the transformed sentence (transformed) reflect the original sentence (statement or definition) with the structure and phrasing provided by the template and subtemplates?

   3. **Grammar and Syntax Accuracy**
      - How is the transformed sentence (transformed) grammatically correct and syntactically accurate from 0 to 1?

   # Output Format

   Record your evaluation in JSON format as follows:

   ```json
   {{
      "doc_id": "<Document ID>",
      "statement_id": "<Statement ID>",
      "sources": ["<source>"],
      "similarity_score": <Similarity score>,
      "similarity_score_confidence": <Confidence score>,
      "transformation_accuracy": <Transformation score>,
      "grammar_syntax_accuracy": <Grammar score>,
      "findings": ["<Things found during the evaluation and worth to be mentioned>", 
                  "<other things to mention>"
                  ],
      "semscore": <original semscore>
   }}
   ```

   # Input example

   {{
      "doc_id": <Document ID>,
      "statement_id": <Statement ID>,
      "statement or definition": <original sentence>,
      "sources": [<source>],
      "terms": [
         {{"term": <signifier>, "classification": <Proper or Common Noun>}},
         ...
      ],
      "verb_symbols": <verbs or phrasal verbs>,
      "element_name": <name of element: Name, Term, Fact, Fact Type, Operative Rule>,
      "transformed": <transformed sentence>,
      "type": <type of element: Definitional, Activity, Party, Data>,
      "subtype": <subtype of element>,
      "templates_ids": ["T8"],
      "semscore": <semscore>
   }}

   # Templates and Subtemplates

   {template}
   """

User prompt

In [22]:
def get_user_prompt_judge_sentence_similarity(element_name, rule):
    return f"""
# rule data for an element: {element_name}

{json.dumps(rule, indent=2)}
    """
Measuring similarity with LLM Judge¶

Preparing system and user prompts for each element and call the judge.

In [23]:
if not SKIP:
    for element_list, data_name in zip(data, data_names):

        system_prompts, user_prompts, element_name = get_prompts_for_judge(
            element_list, config["DEFAULT_DATA_DIR"]
        )

        responses = evaluate_statement(
            element_name=element_name,
            user_prompts=user_prompts,
            system_prompts=system_prompts,
            manager=manager,
        )

        logger.debug(f"{responses=}")

        # Persist the state to a file
        save_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"], manager=manager)

Average similarity score per document 5s.

Elements evaluation¶

In [24]:
managers, file_info_list = get_all_checkpoints(config["DEFAULT_CHECKPOINT_DIR"])
2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-1.json
2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-10.json
2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-2.json
2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-3.json
2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-4.json
2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-5.json
2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-6.json
2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-7.json
2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-8.json
2024-12-14 13:12:26 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-9.json
In [25]:
eval_operative_rules = []
eval_facts = []
eval_terms = []
eval_names = []

for manager, file_info in zip(managers, file_info_list):
    # Process documents
    eval_operative_rules.extend(manager.retrieve_document(
        "validation_judge_Operative_Rules", "llm_validation"
    ).content)
    eval_names.extend(manager.retrieve_document(
        "validation_judge_Names", "llm_validation"
    ).content)
    eval_terms.extend(manager.retrieve_document(
        "validation_judge_Terms", "llm_validation"
    ).content)
    eval_facts.extend(manager.retrieve_document(
        "validation_judge_Fact_Types", "llm_validation"
    ).content)

logger.info(f"Operative Rules: {len(eval_operative_rules)}")
logger.info(f"Names: {len(eval_names)}")
logger.info(f"Terms: {len(eval_terms)}")
logger.info(f"Facts: {len(eval_facts)}")
2024-12-14 13:12:26 - INFO - Operative Rules: 60
2024-12-14 13:12:26 - INFO - Names: 50
2024-12-14 13:12:26 - INFO - Terms: 280
2024-12-14 13:12:26 - INFO - Facts: 160
2024-12-14 13:12:26 - INFO - Names: 50
2024-12-14 13:12:26 - INFO - Terms: 280
2024-12-14 13:12:26 - INFO - Facts: 160
In [26]:
#manager = restore_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"])

elements_data = {
    "Operative_Rules": eval_operative_rules,
    "Names": eval_names,
    "Terms": eval_terms,
    "Fact_Types": eval_facts,
}
In [27]:
for key in elements_data.keys():
    logger.info(f"{key}: {len(elements_data[key])}")
2024-12-14 13:12:26 - INFO - Operative_Rules: 60
2024-12-14 13:12:26 - INFO - Names: 50
2024-12-14 13:12:26 - INFO - Terms: 280
2024-12-14 13:12:26 - INFO - Fact_Types: 160
2024-12-14 13:12:26 - INFO - Names: 50
2024-12-14 13:12:26 - INFO - Terms: 280
2024-12-14 13:12:26 - INFO - Fact_Types: 160

Checking missing data

In [28]:
for element_key in elements_data.keys():
    element_df = pd.DataFrame(elements_data[key])
    mi.matrix(element_df, figsize=(10, 5))
    plt.title(f"Missing Values for {key}")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Process all metrics

In [29]:
df = process_all_elements(elements_data, config["DEFAULT_OUTPUT_DIR"])
2024-12-14 13:12:27 - INFO - Processing All Elements
----------------------------------------
2024-12-14 13:12:27 - INFO - 
Processing Element Type: Operative_Rules
Semscore Statistics:
semscore
count 60.000000
mean 0.940934
std 0.017991
min 0.904715
25% 0.921666
50% 0.945994
75% 0.952143
max 0.961199
range 0.056484
IQR 0.030477
Similarity Score Statistics:
similarity_score
count 60.000000
mean 0.905833
std 0.036928
min 0.850000
25% 0.900000
50% 0.900000
75% 0.950000
max 0.950000
range 0.100000
IQR 0.050000
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:1634: UserWarning: Random effects covariance is singular
  warnings.warn(msg)
Pearson Correlation between semscore and similarity_score: 0.1253
Spearman Correlation between semscore and similarity_score: 0.1281

Intraclass Correlation Coefficient (ICC) between semscore and similarity_score: 0.0000
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:1634: UserWarning: Random effects covariance is singular
  warnings.warn(msg)
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2237: ConvergenceWarning: The MLE may be on the boundary of the parameter space.
  warnings.warn(msg, ConvergenceWarning)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Count of Transformation Accuracy scores above 0.8: 33
No description has been provided for this image
2024-12-14 13:12:29 - INFO - 
Processing Element Type: Names
Semscore Statistics:
semscore
count 50.000000
mean 0.800785
std 0.057943
min 0.678210
25% 0.785631
50% 0.820770
75% 0.850337
max 0.850504
range 0.172295
IQR 0.064706
Similarity Score Statistics:
similarity_score
count 50.000000
mean 0.940000
std 0.020203
min 0.900000
25% 0.950000
50% 0.950000
75% 0.950000
max 0.950000
range 0.050000
IQR 0.000000
Pearson Correlation between semscore and similarity_score: 0.1313
Spearman Correlation between semscore and similarity_score: 0.3465
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  warnings.warn("Maximum Likelihood optimization failed to "
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2200: ConvergenceWarning: Retrying MixedLM optimization with lbfgs
  warnings.warn(
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  warnings.warn("Maximum Likelihood optimization failed to "
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2200: ConvergenceWarning: Retrying MixedLM optimization with cg
  warnings.warn(
Intraclass Correlation Coefficient (ICC) between semscore and similarity_score: 0.1691
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  warnings.warn("Maximum Likelihood optimization failed to "
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2206: ConvergenceWarning: MixedLM optimization failed, trying a different optimizer may help.
  warnings.warn(msg, ConvergenceWarning)
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2218: ConvergenceWarning: Gradient optimization failed, |grad| = 24.388920
  warnings.warn(msg, ConvergenceWarning)
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2237: ConvergenceWarning: The MLE may be on the boundary of the parameter space.
  warnings.warn(msg, ConvergenceWarning)
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2261: ConvergenceWarning: The Hessian matrix at the estimated parameter values is not positive definite.
  warnings.warn(msg, ConvergenceWarning)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Count of Transformation Accuracy scores above 0.8: 50
No description has been provided for this image
2024-12-14 13:12:32 - INFO - 
Processing Element Type: Terms
Semscore Statistics:
semscore
count 280.000000
mean 0.798593
std 0.068700
min 0.654959
25% 0.755620
50% 0.800653
75% 0.848153
max 0.950956
range 0.295996
IQR 0.092533
Similarity Score Statistics:
similarity_score
count 280.000000
mean 0.920000
std 0.048564
min 0.500000
25% 0.900000
50% 0.950000
75% 0.950000
max 1.000000
range 0.500000
IQR 0.050000
Pearson Correlation between semscore and similarity_score: -0.0952
Spearman Correlation between semscore and similarity_score: -0.1062
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  warnings.warn("Maximum Likelihood optimization failed to "
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2200: ConvergenceWarning: Retrying MixedLM optimization with lbfgs
  warnings.warn(
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  warnings.warn("Maximum Likelihood optimization failed to "
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2200: ConvergenceWarning: Retrying MixedLM optimization with cg
  warnings.warn(
Intraclass Correlation Coefficient (ICC) between semscore and similarity_score: 0.1209
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  warnings.warn("Maximum Likelihood optimization failed to "
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2206: ConvergenceWarning: MixedLM optimization failed, trying a different optimizer may help.
  warnings.warn(msg, ConvergenceWarning)
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2218: ConvergenceWarning: Gradient optimization failed, |grad| = 102.678696
  warnings.warn(msg, ConvergenceWarning)
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2237: ConvergenceWarning: The MLE may be on the boundary of the parameter space.
  warnings.warn(msg, ConvergenceWarning)
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2261: ConvergenceWarning: The Hessian matrix at the estimated parameter values is not positive definite.
  warnings.warn(msg, ConvergenceWarning)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Count of Transformation Accuracy scores above 0.8: 204
No description has been provided for this image
2024-12-14 13:12:40 - INFO - 
Processing Element Type: Fact_Types
Semscore Statistics:
semscore
count 160.000000
mean 0.906189
std 0.054946
min 0.726641
25% 0.865581
50% 0.919224
75% 0.953123
max 0.993932
range 0.267291
IQR 0.087542
Similarity Score Statistics:
similarity_score
count 160.000000
mean 0.918125
std 0.058944
min 0.700000
25% 0.900000
50% 0.950000
75% 0.950000
max 0.950000
range 0.250000
IQR 0.050000
Pearson Correlation between semscore and similarity_score: -0.0440
Spearman Correlation between semscore and similarity_score: 0.0241

Intraclass Correlation Coefficient (ICC) between semscore and similarity_score: 0.0000
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2237: ConvergenceWarning: The MLE may be on the boundary of the parameter space.
  warnings.warn(msg, ConvergenceWarning)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Count of Transformation Accuracy scores above 0.8: 121
No description has been provided for this image
Processing Combined Data
Combined Semscore Statistics:
semscore
count 550.000000
mean 0.845621
std 0.083549
min 0.654959
25% 0.781917
50% 0.848162
75% 0.919216
max 0.993932
range 0.338973
IQR 0.137299
Combined Similarity Score Statistics:
similarity_score
count 550.000000
mean 0.919727
std 0.049468
min 0.500000
25% 0.900000
50% 0.950000
75% 0.950000
max 1.000000
range 0.500000
IQR 0.050000
Combined Pearson Correlation between semscore and similarity_score: -0.1070
Combined Spearman Correlation between semscore and similarity_score: -0.0679
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  warnings.warn("Maximum Likelihood optimization failed to "
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2200: ConvergenceWarning: Retrying MixedLM optimization with lbfgs
  warnings.warn(
/home/adsantos/miniconda3/envs/ipt-cfr2sbvr/lib/python3.11/site-packages/statsmodels/regression/mixed_linear_model.py:2237: ConvergenceWarning: The MLE may be on the boundary of the parameter space.
  warnings.warn(msg, ConvergenceWarning)
Combined Intraclass Correlation Coefficient (ICC) between semscore and similarity_score: 0.0236
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Combined Count of Transformation Accuracy scores above 0.8: 408
No description has been provided for this image
Analysis saved to '../outputs/combined_analysis_results.xlsx'

Metrics¶

In [108]:
# Updated functions for side-by-side plotting and color customization

# Function to plot histogram for semscore and similarity_score side-by-side
def plot_histogram_side_by_side(df, title, xlabel, output_dir, filename):
    plt.figure(figsize=(12, 6))
    plt.hist(df["semscore"], bins=20, color="#D55E00", alpha=0.7, label="Semscore", linestyle="--", edgecolor="black")
    plt.hist(df["similarity_score"], bins=20, color="#0072B2", alpha=0.7, label="Similarity Score", linestyle="-", edgecolor="black")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.legend()
    plt.grid(True)
    plt.show()

    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.close()
    return plot_path

# Function to plot box plot for semscore and similarity_score side-by-side
def plot_boxplot_side_by_side(df, title, ylabel, output_dir, filename):
    plt.figure(figsize=(8, 6))
    boxplot = plt.boxplot(
        [df["semscore"].dropna(), df["similarity_score"].dropna()],
        labels=["Semscore", "Similarity Score"],
        patch_artist=True,
        boxprops=dict(color="black"),
        medianprops=dict(color="black"),
        capprops=dict(color="black"),
        whiskerprops=dict(color="black"),
    )
    colors = ["#D55E00", "#0072B2"]
    for patch, color in zip(boxplot['boxes'], colors):
        patch.set_facecolor(color)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.show()

    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.close()
    return plot_path

# Function to plot density plot for semscore and similarity_score side-by-side
def plot_density_side_by_side(df, title, xlabel, output_dir, filename):
    plt.figure(figsize=(12, 6))
    df["semscore"].plot(kind="kde", color="#D55E00", alpha=0.7, linestyle="--", label="Semscore")
    df["similarity_score"].plot(kind="kde", color="#0072B2", alpha=0.7, linestyle="-", label="Similarity Score")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Density")
    plt.legend()
    plt.grid(True)
    plt.show()

    plot_path = os.path.join(output_dir, filename)
    plt.savefig(plot_path)
    plt.close()
    return plot_path

# Updated process_all_elements function

def process_all_elements_updated(element_data, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    excel_file_path = os.path.join(output_dir, "combined_analysis_results.xlsx")
    writer = pd.ExcelWriter(excel_file_path, engine="xlsxwriter")
    workbook = writer.book
    combined_df_list = []
    image_files = []

    for element_name, content in element_data.items():
        df = pd.DataFrame(content)
        numeric_cols = ["semscore", "similarity_score"]
        df[numeric_cols] = df[numeric_cols].astype(float)
        df["element_type"] = element_name
        combined_df_list.append(df)
        sheet_name = element_name[:31]
        worksheet = workbook.add_worksheet(sheet_name)
        writer.sheets[sheet_name] = worksheet
        row = 0

        # Histograms side-by-side
        plot_filename = f"histogram_side_by_side_{element_name}.png"
        plot_path = plot_histogram_side_by_side(
            df,
            f"Histograms of Semscore and Similarity Score - {element_name}",
            "Scores",
            output_dir,
            plot_filename,
        )
        worksheet.insert_image(row, 0, plot_path)
        row += 20
        image_files.append(plot_path)

        # Boxplots side-by-side
        plot_filename = f"boxplot_side_by_side_{element_name}.png"
        plot_path = plot_boxplot_side_by_side(
            df,
            f"Boxplots of Semscore and Similarity Score - {element_name}",
            "Scores",
            output_dir,
            plot_filename,
        )
        worksheet.insert_image(row, 0, plot_path)
        row += 20
        image_files.append(plot_path)

        # Density plots side-by-side
        plot_filename = f"density_side_by_side_{element_name}.png"
        plot_path = plot_density_side_by_side(
            df,
            f"Density Plots of Semscore and Similarity Score - {element_name}",
            "Scores",
            output_dir,
            plot_filename,
        )
        worksheet.insert_image(row, 0, plot_path)
        row += 20
        image_files.append(plot_path)

    combined_df = pd.concat(combined_df_list, ignore_index=True)

    # Combined Histograms side-by-side
    plot_filename = "histogram_side_by_side_combined.png"
    plot_path = plot_histogram_side_by_side(
        combined_df,
        "Combined Histograms of Semscore and Similarity Score",
        "Scores",
        output_dir,
        plot_filename,
    )
    writer.sheets["Combined"] = workbook.add_worksheet("Combined")
    writer.sheets["Combined"].insert_image(0, 0, plot_path)
    image_files.append(plot_path)

    # Combined Boxplots side-by-side
    plot_filename = "boxplot_side_by_side_combined.png"
    plot_path = plot_boxplot_side_by_side(
        combined_df,
        "Combined Boxplots of Semscore and Similarity Score",
        "Scores",
        output_dir,
        plot_filename,
    )
    writer.sheets["Combined"].insert_image(25, 0, plot_path)
    image_files.append(plot_path)

    # Combined Density Plots side-by-side
    plot_filename = "density_side_by_side_combined.png"
    plot_path = plot_density_side_by_side(
        combined_df,
        "Combined Density Plots of Semscore and Similarity Score",
        "Scores",
        output_dir,
        plot_filename,
    )
    writer.sheets["Combined"].insert_image(50, 0, plot_path)
    image_files.append(plot_path)

    writer.close()
    for image_file in image_files:
        if os.path.exists(image_file):
            os.remove(image_file)
    return combined_df
In [109]:
combined_df = process_all_elements_updated(elements_data, config["DEFAULT_OUTPUT_DIR"])
No description has been provided for this image
/tmp/ipykernel_4986/1576221141.py:23: MatplotlibDeprecationWarning:

The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
/tmp/ipykernel_4986/1576221141.py:23: MatplotlibDeprecationWarning:

The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
/tmp/ipykernel_4986/1576221141.py:23: MatplotlibDeprecationWarning:

The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
/tmp/ipykernel_4986/1576221141.py:23: MatplotlibDeprecationWarning:

The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
/tmp/ipykernel_4986/1576221141.py:23: MatplotlibDeprecationWarning:

The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.

No description has been provided for this image
No description has been provided for this image

Describing the metrics semscore and similarity_score

In [120]:
combined_df.groupby("element_type")[["semscore", "similarity_score"]].describe()#.to_excel(config["DEFAULT_OUTPUT_DIR"] + "/sem_sim_descriptive_stats.xlsx")
Out[120]:
semscore similarity_score
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
element_type
Fact_Types 160.0 0.906189 0.054946 0.726641 0.865581 0.919224 0.953123 0.993932 160.0 0.918125 0.058944 0.70 0.90 0.95 0.95 0.95
Names 50.0 0.800785 0.057943 0.678210 0.785631 0.820770 0.850337 0.850504 50.0 0.940000 0.020203 0.90 0.95 0.95 0.95 0.95
Operative_Rules 60.0 0.940934 0.017991 0.904715 0.921666 0.945994 0.952143 0.961199 60.0 0.905833 0.036928 0.85 0.90 0.90 0.95 0.95
Terms 280.0 0.798593 0.068700 0.654959 0.755620 0.800653 0.848153 0.950956 280.0 0.920000 0.048564 0.50 0.90 0.95 0.95 1.00

See correlation analysis below

Similarity_score and confidence

In [76]:
combined_df.groupby("element_type")[["similarity_score", "similarity_score_confidence"]].describe()
Out[76]:
similarity_score similarity_score_confidence
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
element_type
Fact_Types 160.0 0.918125 0.058944 0.70 0.90 0.95 0.95 0.95 160.0 0.892188 0.023490 0.80 0.90 0.9 0.9 0.9
Names 50.0 0.940000 0.020203 0.90 0.95 0.95 0.95 0.95 50.0 0.891000 0.019404 0.85 0.90 0.9 0.9 0.9
Operative_Rules 60.0 0.905833 0.036928 0.85 0.90 0.90 0.95 0.95 60.0 0.882500 0.024050 0.85 0.85 0.9 0.9 0.9
Terms 280.0 0.920000 0.048564 0.50 0.90 0.95 0.95 1.00 280.0 0.884821 0.033760 0.70 0.85 0.9 0.9 1.0
In [77]:
# Calculate correlation by element_type
combined_df.groupby("element_type").apply(
    lambda group: group["similarity_score"].corr(group["similarity_score_confidence"])
).reset_index(name="correlation")
/tmp/ipykernel_4986/3731941340.py:2: DeprecationWarning:

DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.

Out[77]:
element_type correlation
0 Fact_Types 0.647984
1 Names 0.937043
2 Operative_Rules 0.116892
3 Terms 0.628518

transformation_accuracy and grammar_syntax_accuracy

In [78]:
combined_df.groupby("element_type")[["transformation_accuracy", "grammar_syntax_accuracy"]].describe()
Out[78]:
transformation_accuracy grammar_syntax_accuracy
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
element_type
Fact_Types 160.0 0.869688 0.086892 0.60 0.85 0.90 0.9 0.95 160.0 0.933750 0.080006 0.60 0.95 0.95 0.95 1.00
Names 50.0 0.900000 0.010102 0.85 0.90 0.90 0.9 0.95 50.0 0.970000 0.024744 0.95 0.95 0.95 1.00 1.00
Operative_Rules 60.0 0.851667 0.050394 0.80 0.80 0.85 0.9 0.95 60.0 0.941667 0.018791 0.90 0.95 0.95 0.95 0.95
Terms 280.0 0.875536 0.075608 0.30 0.80 0.90 0.9 1.00 280.0 0.952143 0.074745 0.20 0.95 0.95 1.00 1.00
In [79]:
# Calculate correlation by element_type
combined_df.groupby("element_type").apply(
    lambda group: group["transformation_accuracy"].corr(group["grammar_syntax_accuracy"])
).reset_index(name="correlation")
/tmp/ipykernel_4986/1485779397.py:2: DeprecationWarning:

DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.

Out[79]:
element_type correlation
0 Fact_Types 0.756493
1 Names 0.204124
2 Operative_Rules 0.462378
3 Terms 0.764044

Correlation analysis similarity_score and semscore¶

Top 10 lowest semscore

In [80]:
# Make a copy of the DataFrame for further analysis
df_aval = df.copy()

df_similarity = df.copy()

df_agree = df.copy()
In [81]:
df_aval.nsmallest(10, ['semscore'])
Out[81]:
doc_id statement_id statement sources semscore similarity_score similarity_score_confidence transformation_accuracy grammar_syntax_accuracy findings element_type
112 § 275.0-2 Managing agent Any person, including a trustee, who directs o... [(b)(1)] 0.654959 0.95 0.9 0.95 1.00 [The transformed sentence accurately reflects ... Terms
224 § 275.0-2 Managing agent Any person, including a trustee, who directs o... [(b)(1)] 0.655024 0.95 0.9 0.95 1.00 [The transformed sentence accurately reflects ... Terms
168 § 275.0-2 Managing agent Any person, including a trustee, who directs o... [(b)(1)] 0.655041 0.95 0.9 0.90 1.00 [The transformed sentence accurately reflects ... Terms
308 § 275.0-2 Managing agent Any person, including a trustee, who directs o... [(b)(1)] 0.655048 0.95 0.9 0.90 1.00 [The transformed sentence accurately reflects ... Terms
252 § 275.0-2 Managing agent Any person, including a trustee, who directs o... [(b)(1)] 0.655050 0.95 0.9 0.90 1.00 [The transformed sentence accurately reflects ... Terms
140 § 275.0-2 Managing agent Any person, including a trustee, who directs o... [(b)(1)] 0.655103 0.95 0.9 0.90 0.95 [The transformed sentence accurately reflects ... Terms
336 § 275.0-2 Managing agent Any person, including a trustee, who directs o... [(b)(1)] 0.655213 0.95 0.9 0.95 1.00 [The transformed sentence accurately reflects ... Terms
196 § 275.0-2 Managing agent Any person, including a trustee, who directs o... [(b)(1)] 0.655215 0.95 0.9 0.95 1.00 [The transformed sentence accurately reflects ... Terms
280 § 275.0-2 Managing agent Any person, including a trustee, who directs o... [(b)(1)] 0.655292 0.95 0.9 0.90 1.00 [The transformed sentence accurately reflects ... Terms
364 § 275.0-2 Managing agent Any person, including a trustee, who directs o... [(b)(1)] 0.655489 0.95 0.9 0.95 1.00 [The transformed sentence accurately reflects ... Terms

Top 10 lowest similarity_score

In [82]:
df_smallest=df_aval.nsmallest(100, ['similarity_score', "semscore"])
In [83]:
df_smallest
Out[83]:
doc_id statement_id statement sources semscore similarity_score similarity_score_confidence transformation_accuracy grammar_syntax_accuracy findings element_type
296 § 275.0-5 Order of the Commission An order issued by the Commission under the Act. [(d)] 0.810312 0.5 0.70 0.3 0.20 [The transformed sentence 'An order is by defi... Terms
249 § 275.0-7 Trust A person is presumed to control a trust if the... [(b)(1)(iv)] 0.851620 0.7 0.80 0.6 0.70 [The transformed sentence introduces the term ... Terms
333 § 275.0-7 Trust A person is presumed to control a trust if the... [(b)(1)(iv)] 0.917104 0.7 0.80 0.6 0.50 [The transformed sentence changes the meaning ... Terms
502 § 275.0-2 1 A person may serve process, pleadings, or othe... [(a)] 0.934140 0.7 0.80 0.6 0.90 [The transformed sentence introduces 'by defin... Fact_Types
390 § 275.0-2 1 A person may serve process, pleadings, or othe... [(a)] 0.934150 0.7 0.80 0.6 0.70 [The transformed sentence does not accurately ... Fact_Types
... ... ... ... ... ... ... ... ... ... ... ...
177 § 275.0-5 Facts Information submitted to the Commission bearin... [(a)] 0.759489 0.9 0.85 0.8 0.95 [The transformed sentence maintains the core m... Terms
345 § 275.0-5 Facts Information submitted to the Commission bearin... [(a)] 0.759489 0.9 0.85 0.8 0.95 [The transformed sentence maintains the core m... Terms
289 § 275.0-5 Facts Information submitted to the Commission bearin... [(a)] 0.759561 0.9 0.85 0.8 0.95 [The transformed sentence accurately reflects ... Terms
121 § 275.0-5 Facts Information submitted to the Commission bearin... [(a)] 0.759566 0.9 0.85 0.8 0.95 [The transformed sentence maintains the core m... Terms
261 § 275.0-5 Facts Information submitted to the Commission bearin... [(a)] 0.760273 0.9 0.85 0.8 0.95 [The transformed sentence maintains the core m... Terms

100 rows × 11 columns

In [84]:
df_smallest=df_aval.nsmallest(100, ['similarity_score', "semscore"])['score_difference'] = df_similarity['similarity_score'] - df_similarity['semscore']
In [85]:
# Convert the 'sources' column to a string type to allow dropping duplicates
df_aval['sources'] = df_aval['sources'].apply(str)

# Filter the distinct records based on doc_id, statement_id, statement, and sources
df_aval.drop_duplicates(subset=['doc_id', 'statement_id', 'statement', 'sources'])
Out[85]:
doc_id statement_id statement sources semscore similarity_score similarity_score_confidence transformation_accuracy grammar_syntax_accuracy findings element_type
0 § 275.0-2 3 The Secretary of the Commission (Secretary) wi... ['(a)(2)'] 0.952598 0.90 0.85 0.8 0.95 [The transformed sentence maintains the core m... Operative_Rules
1 § 275.0-2 4 If the Secretary certifies that the Commission... ['(a)(3)'] 0.944126 0.95 0.90 0.9 0.95 [The transformed sentence maintains the origin... Operative_Rules
2 § 275.0-5 1 Notice of the initiation of the proceeding wil... ['(a)'] 0.946623 0.95 0.90 0.9 0.95 [The transformed sentence maintains the origin... Operative_Rules
3 § 275.0-5 2 Any interested person may, within the period o... ['(a)'] 0.914574 0.85 0.90 0.8 0.95 [The transformed sentence captures the essence... Operative_Rules
4 § 275.0-5 3 An order disposing of the matter will be issue... ['(b)'] 0.958645 0.90 0.85 0.8 0.90 [The transformed sentence maintains the core m... Operative_Rules
... ... ... ... ... ... ... ... ... ... ... ...
405 § 275.0-7 10 Total assets means the total assets as shown o... ['(b)(2)'] 0.976969 0.95 0.90 0.9 0.95 [The transformed sentence closely follows the ... Fact_Types
415 § 275.0-7 4 An investment adviser does not control, is not... ['(a)(3)'] 0.963108 0.95 0.90 0.9 0.95 [The transformed sentence maintains the origin... Fact_Types
461 \n§ 275.0-7 2 An investment adviser has assets under managem... ['(a)(1)'] 0.889467 0.95 0.90 0.9 0.95 [The transformed sentence accurately reflects ... Fact_Types
462 \n§ 275.0-7 3 An investment adviser did not have total asset... ['(a)(2)'] 0.842859 0.95 0.90 0.9 1.00 [The transformed sentence accurately reflects ... Fact_Types
469 \n§ 275.0-7 10 Total assets means the total assets as shown o... ['(b)(2)'] 0.976953 0.95 0.90 0.9 0.95 [The transformed sentence closely follows the ... Fact_Types

61 rows × 11 columns

In [86]:
df_similarity['score_difference'] = df_similarity['similarity_score'] - df_similarity['semscore']
In [87]:
df_similarity
Out[87]:
doc_id statement_id statement sources semscore similarity_score similarity_score_confidence transformation_accuracy grammar_syntax_accuracy findings element_type score_difference
0 § 275.0-2 3 The Secretary of the Commission (Secretary) wi... [(a)(2)] 0.952598 0.90 0.85 0.8 0.95 [The transformed sentence maintains the core m... Operative_Rules -0.052598
1 § 275.0-2 4 If the Secretary certifies that the Commission... [(a)(3)] 0.944126 0.95 0.90 0.9 0.95 [The transformed sentence maintains the origin... Operative_Rules 0.005874
2 § 275.0-5 1 Notice of the initiation of the proceeding wil... [(a)] 0.946623 0.95 0.90 0.9 0.95 [The transformed sentence maintains the origin... Operative_Rules 0.003377
3 § 275.0-5 2 Any interested person may, within the period o... [(a)] 0.914574 0.85 0.90 0.8 0.95 [The transformed sentence captures the essence... Operative_Rules -0.064574
4 § 275.0-5 3 An order disposing of the matter will be issue... [(b)] 0.958645 0.90 0.85 0.8 0.90 [The transformed sentence maintains the core m... Operative_Rules -0.058645
... ... ... ... ... ... ... ... ... ... ... ... ...
545 § 275.0-7 6 A person is presumed to control a corporation ... [(b)(1)(i)(A)] 0.785793 0.90 0.85 0.8 0.95 [The transformed sentence maintains the core m... Fact_Types 0.114207
546 § 275.0-7 7 A person is presumed to control a partnership ... [(b)(1)(ii)] 0.938356 0.95 0.90 0.9 0.95 [The transformed sentence maintains the origin... Fact_Types 0.011644
547 § 275.0-7 8 A person is presumed to control a limited liab... [(b)(1)(iii)] 0.953968 0.95 0.90 0.9 0.95 [The transformed sentence maintains the origin... Fact_Types -0.003968
548 § 275.0-7 9 A person is presumed to control a trust if the... [(b)(1)(iv)] 0.917051 0.80 0.80 0.6 0.70 [The transformed sentence incorrectly suggests... Fact_Types -0.117051
549 § 275.0-7 10 Total assets means the total assets as shown o... [(b)(2)] 0.976969 0.95 0.90 0.9 0.95 [The transformed sentence closely follows the ... Fact_Types -0.026969

550 rows × 12 columns

In [111]:
# Plot the semscore, similarity score, and score difference on the same graph
plt.figure(figsize=(12, 6))

plt.plot(df_similarity.index, df_similarity['semscore'], color='#D55E00', marker='x', linestyle='--', label='Semscore')
plt.plot(df_similarity.index, df_similarity['similarity_score'], color='#0072B2', marker='o', linestyle='-', label='Similarity Score')

plt.title('Semscore, and Similarity Score Across Records')
plt.xlabel('Record Index')
plt.ylabel('Scores')
plt.grid(True)
plt.legend()
plt.show()
No description has been provided for this image
In [89]:
# Plot the score difference as a line chart
plt.figure(figsize=(10, 6))
plt.plot(df_similarity.index, df_similarity['score_difference'], marker='o', linestyle='-', label='Score Difference')
plt.title('Score Difference Across Records')
plt.xlabel('Record Index')
plt.ylabel('Score Difference')
plt.grid(True)
plt.legend()
plt.show()
No description has been provided for this image
In [118]:
# Create an interactive scatter plot
fig = go.Figure()

marker_map = {
    'Operative_Rules': 'circle',
    'Names': 'x',
    'Terms': 'triangle-up',
    'Fact_Types': 'diamond'
}

# Add a trace for each element_type
unique_types = df_similarity['element_type'].unique()
for etype in unique_types:
    filtered_data = df_similarity[df_similarity['element_type'] == etype]
    fig.add_trace(go.Scatter(
        x=filtered_data.index,
        y=filtered_data['score_difference'],
        mode='lines+markers',
        marker=dict(symbol=marker_map[etype]),  # Wrap the symbol in a dictionary
        name=etype,
        visible=True  # Ensure all traces are visible initially
    ))

# Add dropdown to filter by element_type
dropdown_buttons = [
    dict(label="All",
         method="update",
         args=[{"visible": [True] * len(unique_types)},  # Show all traces
               {"title": "Score Difference - All Element Types"}]),
]

for i, etype in enumerate(unique_types):
    dropdown_buttons.append(
        dict(label=etype,
             method="update",
             args=[{"visible": [j == i for j in range(len(unique_types))]},  # Show only the selected trace
                   {"title": f"Score Difference - {etype}"}])
    )

fig.update_layout(
    updatemenus=[
        dict(
            buttons=dropdown_buttons,
            direction="down",
            showactive=True,
            x=0.1,
            y=1.15
        )
    ],
    title="Score Difference Across Element Types",
    xaxis_title="Record Index",
    yaxis_title="Score Difference",
    showlegend=True
)

fig.show()
In [92]:
df_agree['score_difference'] = df_agree['similarity_score'] - df_agree['semscore']

# Calculate the required values
agree = ((df_agree['score_difference'] >= -0.01) & (df_agree['score_difference'] <= 0.01)).sum()
similarity = (1 - df_agree.loc[df_agree['score_difference'] > 0.01, 'score_difference']).sum()
semscore = (1 - df_agree.loc[df_agree['score_difference'] < 0.01, 'score_difference']).sum()

# Create a new DataFrame with the calculated values
summary_df = pd.DataFrame({
    'Metric': ['Agree', 'Similarity', 'Semscore'],
    'Value': [agree, similarity, semscore]
})

# Plot the histogram
plt.figure(figsize=(8, 6))
plt.bar(summary_df['Metric'], summary_df['Value'], color=['blue', 'green', 'red'])
plt.title('Histogram of Metrics')
plt.xlabel('Metrics')
plt.ylabel('Values')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
No description has been provided for this image
In [43]:
# Display the calculated values
summary_df
Out[43]:
Metric Value
0 Agree 32.000000
1 Similarity 361.177934
2 Semscore 148.063580
In [44]:
# Count the occurrences for each metric
agree_count = ((df_agree['score_difference'] >= -0.01) & (df_agree['score_difference'] <= 0.01)).sum()
similarity_count = (df_agree['score_difference'] > 0.01).sum()
semscore_count = (df_agree['score_difference'] < 0.01).sum()

# Create a new DataFrame with the counts
count_summary_df = pd.DataFrame({
    'Metric': ['Agree', 'Similarity', 'Semscore'],
    'Count': [agree_count, similarity_count, semscore_count]
})

# Plot the histogram for counts
plt.figure(figsize=(8, 6))
plt.bar(count_summary_df['Metric'], count_summary_df['Count'], color=['blue', 'green', 'red'])
plt.title('Histogram of Metric Counts')
plt.xlabel('Metrics')
plt.ylabel('Counts')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
No description has been provided for this image
In [45]:
# Display the calculated counts
count_summary_df
Out[45]:
Metric Count
0 Agree 32
1 Similarity 410
2 Semscore 140
In [46]:
# Define the 10% margin
margin = 0.1

# Compute agreement within the +/-10% margin
agreement_margin = ((df_agree['similarity_score'] >= (df_agree['semscore'] - margin)) &
                    (df_agree['similarity_score'] <= (df_agree['semscore'] + margin))).sum()

# Compute disagreement outside the +/-10% margin
disagreement_margin = len(df_agree) - agreement_margin

# Display the results
agreement_disagreement_summary = pd.DataFrame({
    'Metric': ['Agreement', 'Disagreement'],
    'Count': [agreement_margin, disagreement_margin]
})
In [47]:
agreement_disagreement_summary
Out[47]:
Metric Count
0 Agreement 306
1 Disagreement 244
In [48]:
# Compute proportional agreement within the ±10% margin
df_agree['agreement_proportion'] = 1 - (df_agree['similarity_score'] - df_agree['semscore']).abs() / margin
#df_agree['agreement_proportion'] = df_agree['agreement_proportion'].clip(lower=0)  # Clip negative values to 0

# Plot the proportional agreement series
plt.figure(figsize=(12, 6))
plt.plot(df_agree.index, df_agree['agreement_proportion'], marker='o', linestyle='-', label='Proportional Agreement')
plt.title('Proportional Agreement Series')
plt.xlabel('Record Index')
plt.ylabel('Agreement Proportion')
plt.grid(True)
plt.legend()
plt.show()
No description has been provided for this image

Agree that something is bad

In [49]:
# Define the threshold for "low" scores
low_threshold = 2 * 0.7  # 70% of the metrics combined

# Identify rows where both metrics are below the threshold
low_agreement_df = df_agree[
    ((df_agree['similarity_score'] + df_agree['semscore']) < low_threshold)
    #(df_agree['semscore'] < low_threshold)
]

# Display the filtered dataframe
low_agreement_df
Out[49]:
doc_id statement_id statement sources semscore similarity_score similarity_score_confidence transformation_accuracy grammar_syntax_accuracy findings element_type score_difference agreement_proportion
296 § 275.0-5 Order of the Commission An order issued by the Commission under the Act. [(d)] 0.810312 0.5 0.7 0.3 0.2 [The transformed sentence 'An order is by defi... Terms -0.310312 -2.103124
In [50]:
# Display the dataframe with the proportional agreement column
df_agree.sort_values('agreement_proportion', ascending=True)
Out[50]:
doc_id statement_id statement sources semscore similarity_score similarity_score_confidence transformation_accuracy grammar_syntax_accuracy findings element_type score_difference agreement_proportion
296 § 275.0-5 Order of the Commission An order issued by the Commission under the Act. [(d)] 0.810312 0.50 0.70 0.30 0.20 [The transformed sentence 'An order is by defi... Terms -0.310312 -2.103124
112 § 275.0-2 Managing agent Any person, including a trustee, who directs o... [(b)(1)] 0.654959 0.95 0.90 0.95 1.00 [The transformed sentence accurately reflects ... Terms 0.295041 -1.950409
224 § 275.0-2 Managing agent Any person, including a trustee, who directs o... [(b)(1)] 0.655024 0.95 0.90 0.95 1.00 [The transformed sentence accurately reflects ... Terms 0.294976 -1.949760
168 § 275.0-2 Managing agent Any person, including a trustee, who directs o... [(b)(1)] 0.655041 0.95 0.90 0.90 1.00 [The transformed sentence accurately reflects ... Terms 0.294959 -1.949594
308 § 275.0-2 Managing agent Any person, including a trustee, who directs o... [(b)(1)] 0.655048 0.95 0.90 0.90 1.00 [The transformed sentence accurately reflects ... Terms 0.294952 -1.949520
... ... ... ... ... ... ... ... ... ... ... ... ... ...
330 § 275.0-7 Right to vote A person is presumed to control a corporation ... [(b)(1)(iii), (b)(1)(i)(A)] 0.950956 0.95 0.95 0.90 1.00 [The transformed sentence closely mirrors the ... Terms -0.000956 0.990445
274 § 275.0-7 Right to vote A person is presumed to control a corporation ... [(b)(1)(iii), (b)(1)(i)(A)] 0.950775 0.95 0.95 0.90 1.00 [The transformed sentence closely mirrors the ... Terms -0.000775 0.992246
302 § 275.0-7 Right to vote A person is presumed to control a corporation ... [(b)(1)(iii), (b)(1)(i)(A)] 0.950740 0.95 0.95 0.90 0.95 [The transformed sentence closely mirrors the ... Terms -0.000740 0.992600
246 § 275.0-7 Right to vote A person is presumed to control a corporation ... [(b)(1)(iii), (b)(1)(i)(A)] 0.950586 0.95 0.95 0.90 1.00 [The transformed sentence maintains the origin... Terms -0.000586 0.994141
218 § 275.0-7 Right to vote A person is presumed to control a corporation ... [(b)(1)(iii), (b)(1)(i)(A)] 0.950586 0.95 0.95 0.90 1.00 [The transformed sentence maintains the origin... Terms -0.000586 0.994141

550 rows × 13 columns

Correlation analysis using Spearman, Kendall, and Pearson

Kendall

In [120]:
# Compute Kendall's Tau correlation to assess monotonicity
kendall_correlation, p_value_kendall = kendalltau(df_agree['similarity_score'], df_agree['semscore'])

kendall_correlation, p_value_kendall
Out[120]:
(-0.04888900614249123, 0.14007660419068518)

Spearman

In [119]:
# Check if the relationship between similarity_score and semscore is monotonic
# Compute Spearman's rank correlation to assess monotonicity
spearman_correlation, p_value = spearmanr(df_agree['similarity_score'], df_agree['semscore'])

spearman_correlation, p_value
Out[119]:
(-0.0678800931290666, 0.11180281052440856)

Pearson

In [118]:
# Calculate the correlation between similarity_score and semscore
correlation = df_agree['similarity_score'].corr(df_agree['semscore'])

correlation
Out[118]:
-0.10703687388311389

A correlation of -0.107 indicates a weak negative linear relationship between the variables, suggesting that as one variable slightly increases, the other tends to decrease marginally. However, the relationship is negligible, indicating little to no linear association. This weak correlation implies that changes in one variable do not reliably predict changes in the other. Furthermore, the low magnitude does not preclude the possibility of a non-linear relationship, which would require alternative methods of analysis for detection.

Prompt analysis¶

Analyze number of tokens from prompts and documents from last checkpoint using gpt-4o as a reference model.

According to OpenAI | models, the maximum number of tokens (context length) for gpt-4o is 128k.

The cost to use gpt-4o is 2.50 USD per 1m tokens in 2024-10-31. Source: OpenAI | pricing.

Extract elapse times and completions from all sessions.

In [53]:
managers, file_info_list = get_all_checkpoints(config["DEFAULT_CHECKPOINT_DIR"])
2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-1.json
2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-10.json
2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-2.json
2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-3.json
2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-4.json
2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-5.json
2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-6.json
2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-7.json
2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-8.json
2024-12-13 20:58:03 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-9.json
In [54]:
tokens_eval = {"doc_type": [], "elapsed_times": [], "completions": []}

for manager, file_info in zip(managers, file_info_list):
    # Process documents
    for key in manager.model_dump()["documents"].keys():
        if key[1].startswith("llm_"):
            doc = manager.retrieve_document(key[0], key[1])
            logger.info(f"Processing: {key[0]}, {key[1]}")
            elapsed_times = doc.elapsed_times
            logger.debug(f"Elapsed time: {elapsed_times}")
            completions = doc.completions
            logger.debug(f"Completions: {completions}")
            tokens_eval["doc_type"].append(key[1])
            tokens_eval["elapsed_times"].append(elapsed_times)
            tokens_eval["completions"].append(completions)
logger.info(f"Executions for evaluation: {len(tokens_eval['doc_type'])}")
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-2_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-5_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P1, llm_response
2024-12-13 20:58:03 - INFO - Processing: § 275.0-7_P2, llm_response
2024-12-13 20:58:03 - INFO - Processing: classify_P1, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification
2024-12-13 20:58:03 - INFO - Processing: transform_Operative_Rules, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Fact_Types, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Terms, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: transform_Names, llm_response_transform
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Fact_Types, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Terms, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Names, llm_validation
2024-12-13 20:58:03 - INFO - Processing: validation_judge_Operative_Rules, llm_validation
2024-12-13 20:58:03 - INFO - Executions for evaluation: 190

Evaluate

In [55]:
# Constants
reference_models = config["REFERENCE_MODELS"]["MAX_CONTEXT_LENGTH"]
price_per_million_tokens = config["REFERENCE_MODELS"]["PRICE_PER_MILLION_TOKENS"]

# Initialize an empty list to store the raw data
raw_data = []

# Assuming tokens_eval is already defined and contains the necessary data
for doc_type, elapsed_times, completions in zip(
    tokens_eval["doc_type"], tokens_eval["elapsed_times"], tokens_eval["completions"]
):
    for elapsed_time, completion in zip(elapsed_times, completions):
        raw_data.append(
            (
                file_info["filename"],
                doc_type,
                elapsed_time,
                completion["usage"],
                completion["created"],
                completion["model"],
            )
        )

prompt_analysis(raw_data, config["DEFAULT_OUTPUT_DIR"])
Overall Statistics:
 Total Tokens  Number of Samples  Average Elapsed Time (s)  Estimated Cost (USD)  Average Percentage of Context Length (%) Min Created Timestamp Max Created Timestamp                      origin              run_at
      5472538               1210                  3.819458             13.681345                                  3.533405   2024-11-30 00:08:20   2024-12-09 02:11:07 documents-2024-12-08-9.json 2024-12-13 20:58:03

Statistics by Sample Type (doc_type):
                   doc_type  total_tokens  num_samples  average_elapsed_time  average_tokens  estimated_cost  average_percentage_context_length                    filename              run_at
               llm_response        272850           60             28.951583     4547.500000        0.682125                           3.552734 documents-2024-12-08-9.json 2024-12-13 20:58:03
llm_response_classification        370390           50              8.280727     7407.800000        0.925975                           5.787344 documents-2024-12-08-9.json 2024-12-13 20:58:03
     llm_response_transform       2480495          550              2.315991     4509.990909        6.201237                           3.523430 documents-2024-12-08-9.json 2024-12-13 20:58:03
             llm_validation       2348803          550              2.175670     4270.550909        5.872008                           3.336368 documents-2024-12-08-9.json 2024-12-13 20:58:03

Statistics by Model:
            model  total_tokens  num_samples  average_elapsed_time  average_tokens  average_percentage_context_length                    filename              run_at  estimated_cost      cost
gpt-4o-2024-08-06       5472538         1210              3.819458     4522.758678                           3.533405 documents-2024-12-08-9.json 2024-12-13 20:58:03       13.681345 13.681345

Additional Statistics:
 Average Completion Tokens  Average Prompt Tokens  Average Total Tokens per Sample  Total Elapsed Time (s)  Average Tokens per Second                      origin              run_at
                314.103306            4208.655372                      4522.758678             4621.544686                1959.897985 documents-2024-12-08-9.json 2024-12-13 20:58:03

Discussion¶

TODO